This presentation introduces a complete RNA-Seq analysis workflow using the DESeq2 package. You will learn how to:
The pasilla dataset comes from an experiment investigating the effect of siRNA knockdown of the pasilla gene in Drosophila.
# Load the count matrix and sample annotation.
# Adjust file paths as needed.
countData <- read.delim("../data/deseq/pasilla_gene_counts.tsv", row.names = 1)
head(countData) untreated1 untreated2 untreated3 untreated4 treated1 treated2
FBgn0000003 0 0 0 0 0 0
FBgn0000008 92 161 76 70 140 88
FBgn0000014 5 1 0 0 4 0
FBgn0000015 0 2 1 2 1 0
FBgn0000017 4664 8714 3564 3150 6205 3072
FBgn0000018 583 761 245 310 722 299
treated3
FBgn0000003 1
FBgn0000008 70
FBgn0000014 0
FBgn0000015 0
FBgn0000017 3334
FBgn0000018 308
colData <- read.csv("../data/deseq/pasilla_sample_annotation.csv", row.names = 1)
# Ensure the 'condition' variable is a factor.
colData$condition <- as.factor(colData$condition)
head(colData) condition type number.of.lanes total.number.of.reads
treated1 treated single-read 5 35158667
treated2 treated paired-end 2 12242535 (x2)
treated3 treated paired-end 2 12443664 (x2)
untreated1 untreated single-read 2 17812866
untreated2 untreated single-read 6 34284521
untreated3 untreated paired-end 2 10542625 (x2)
exon.counts
treated1 15679615
treated2 15620018
treated3 12733865
untreated1 14924838
untreated2 20764558
untreated3 10283129
# It is critical that the order of columns in countData matches the rows in colData.
all(rownames(colData) == colnames(countData)) # Should be TRUE[1] FALSE
[1] TRUE
condition type number.of.lanes total.number.of.reads
untreated1 untreated single-read 2 17812866
untreated2 untreated single-read 6 34284521
untreated3 untreated paired-end 2 10542625 (x2)
untreated4 untreated paired-end 2 12214974 (x2)
treated1 treated single-read 5 35158667
treated2 treated paired-end 2 12242535 (x2)
treated3 treated paired-end 2 12443664 (x2)
exon.counts
untreated1 14924838
untreated2 20764558
untreated3 10283129
untreated4 11653031
treated1 15679615
treated2 15620018
treated3 12733865
# Create a DESeqDataSet object using the count data and sample annotation.
dds <- DESeqDataSetFromMatrix(countData = countData,
colData = colData,
design = ~ condition)
# Add gene names as metadata to the DESeqDataSet.
metadata <- data.frame(gene = rownames(countData))
mcols(dds) <- DataFrame(mcols(dds), metadata)
print(dds)class: DESeqDataSet
dim: 14599 7
metadata(1): version
assays(1): counts
rownames(14599): FBgn0000003 FBgn0000008 ... FBgn0261574 FBgn0261575
rowData names(1): gene
colnames(7): untreated1 untreated2 ... treated2 treated3
colData names(5): condition type number.of.lanes total.number.of.reads
exon.counts
# Run the full DESeq2 pipeline:
# 1. Normalization, 2. Dispersion estimation, 3. Model fitting, and 4. Hypothesis testing.
dds <- DESeq(dds)
# Extract results with the default FDR threshold (0.1).
res <- results(dds)
summary(res)
out of 11199 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up) : 513, 4.6%
LFC < 0 (down) : 534, 4.8%
outliers [1] : 1, 0.0089%
low counts [2] : 2606, 23%
(mean count < 5)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
# Extract results using a stricter FDR threshold (alpha = 0.05).
res05 <- results(dds, alpha = 0.05)
summary(res05)
out of 11199 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up) : 408, 3.6%
LFC < 0 (down) : 432, 3.9%
outliers [1] : 1, 0.0089%
low counts [2] : 2823, 25%
(mean count < 6)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
# RNA-Seq count data have wide ranges and variance increases with the mean.
# VST transforms the data so that variance is approximately constant, making it
# better suited for visualizations like PCA and heatmaps.
vsd <- vst(dds, blind = FALSE)
pcaData <- plotPCA(vsd, intgroup = "condition", returnData = TRUE)
pcaData$type <- colData$type # Add sequencing type for extra context.
percentVar <- round(100 * attr(pcaData, "percentVar"))# Compute a distance matrix from the VST data and perform MDS.
dists <- dist(t(assay(vsd)))
mds <- cmdscale(dists)
mdsData <- data.frame(MDS1 = mds[, 1],
MDS2 = mds[, 2],
condition = colData$condition,
type = colData$type)
# Static MDS plot with ggplot2.
p_mds <- ggplot(mdsData, aes(MDS1, MDS2, color = condition, shape = type)) +
geom_point(size = 3) +
ggtitle("MDS Plot of pasilla Samples") +
theme_minimal()
p_mds# Create a volcano plot to display the relationship between fold change and significance.
volcanoData <- as.data.frame(res)
volcanoData$gene <- rownames(volcanoData)
p_volcano <- ggplot(volcanoData, aes(x = log2FoldChange, y = -log10(padj), text = gene)) +
geom_point(alpha = 0.4) +
ggtitle("Volcano Plot") +
xlab("Log2 Fold Change") +
ylab("-Log10 Adjusted P-Value") +
theme_minimal()Sometimes technical variables—like sequencing type (single-read vs. paired-end)—can affect your results. To account for this, include the technical variable as a covariate in your design:
# Adjust the 'type' variable to remove extra characters and ensure it's a factor.
colData$type <- as.factor(gsub("-.+","", colData$type))
dds2 <- DESeqDataSetFromMatrix(countData = countData,
colData = colData,
design = ~ type + condition)
dds2$condition <- relevel(dds2$condition, ref = "untreated")
dds2 <- DESeq(dds2)
# Extract results accounting for technical variation.
res2 <- results(dds2, contrast = c("condition", "untreated", "treated"))
summary(res2)
out of 12359 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up) : 717, 5.8%
LFC < 0 (down) : 613, 5%
outliers [1] : 0, 0%
low counts [2] : 3560, 29%
(mean count < 4)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
# Normalize data for heatmap visualization.
ntd2 <- normTransform(dds2)
library(pheatmap)
df <- as.data.frame(colData(dds2)[, c("condition", "type")])
# Heatmap with hierarchical clustering for differentially expressed genes (padj < 0.01).
select <- res2$padj < 0.001
select[is.na(select)] <- FALSE
pheatmap(assay(ntd2)[select, ], cluster_rows = TRUE, show_rownames = FALSE,
cluster_cols = TRUE, annotation_col = df, scale = "row",
color = colorRampPalette(c("blue", "white", "red"))(100))# Retrieve gene annotations using the org.Dm.eg.db package.
# This provides gene symbols and descriptions based on FlyBase IDs.
library(org.Dm.eg.db)
library(AnnotationDbi)
fly_ids <- rownames(countData)
fly_genes <- select(org.Dm.eg.db, keys = fly_ids, columns = c("SYMBOL", "GENENAME"),
keytype = "FLYBASE")
# Merge annotation data with differential expression results.
res2_05 <- results(dds2, alpha = 0.05, contrast = c("condition", "untreated", "treated"))
res2_05_reordered <- res2_05[order(res2_05$padj), ]
res2_05_annotated <- merge(as.data.frame(res2_05_reordered),
fly_genes, by.x = "row.names",
by.y = "FLYBASE", all.x = TRUE)
res2_05_annotated <- res2_05_annotated[order(res2_05_annotated$padj), ]
write.csv(res2_05_annotated, "../work_dir/deseq2_results_annotated.csv", row.names = FALSE)| Row.names | baseMean | log2FoldChange | lfcSE | stat | pvalue | padj | SYMBOL | GENENAME | |
|---|---|---|---|---|---|---|---|---|---|
| 612 | FBgn0003360 | 4.3e+03 | 3.1e+00 | 1.1e-01 | 2.9e+01 | 0e+00 | 0e+00 | sesB | stress-sensitive B |
| 2624 | FBgn0026562 | 4.4e+04 | 2.5e+00 | 8.7e-02 | 2.8e+01 | 0e+00 | 0e+00 | SPARC | Secreted protein, acidic, cysteine-rich |
| 9831 | FBgn0039155 | 7.3e+02 | 4.6e+00 | 1.7e-01 | 2.8e+01 | 0e+00 | 0e+00 | Kal1 | Kallmann syndrome 1 |
| 2366 | FBgn0025111 | 1.5e+03 | -2.9e+00 | 1.0e-01 | -2.7e+01 | 0e+00 | 0e+00 | Ant2 | Adenine nucleotide translocase 2 |
| 3192 | FBgn0029167 | 3.7e+03 | 2.2e+00 | 9.6e-02 | 2.3e+01 | 0e+00 | 0e+00 | Hml | Hemolectin |
| 6948 | FBgn0035085 | 6.4e+02 | 2.6e+00 | 1.3e-01 | 2.0e+01 | 0e+00 | 0e+00 | CG3770 | uncharacterized protein |
| 10305 | FBgn0039827 | 2.6e+02 | 4.2e+00 | 2.2e-01 | 1.9e+01 | 0e+00 | 0e+00 | CG1544 | uncharacterized protein |
| 6711 | FBgn0034736 | 2.3e+02 | 3.5e+00 | 2.1e-01 | 1.7e+01 | 0e+00 | 0e+00 | gas | gasoline |
| 3411 | FBgn0029896 | 4.9e+02 | 2.4e+00 | 1.5e-01 | 1.7e+01 | 0e+00 | 0e+00 | CG3168 | uncharacterized protein |
| 30 | FBgn0000071 | 3.4e+02 | -2.6e+00 | 1.6e-01 | -1.7e+01 | 0e+00 | 0e+00 | Ama | Amalgam |
| 9598 | FBgn0038832 | 3.0e+02 | 2.5e+00 | 1.8e-01 | 1.4e+01 | 0e+00 | 0e+00 | CG15695 | uncharacterized protein |
| 6815 | FBgn0034897 | 8.9e+02 | 1.9e+00 | 1.4e-01 | 1.4e+01 | 0e+00 | 0e+00 | Sesn | Sestrin |
| 10415 | FBgn0040091 | 1.1e+03 | 1.6e+00 | 1.2e-01 | 1.3e+01 | 0e+00 | 0e+00 | Ugt317A1 | UDP-glycosyltransferase family 317 member A1 |
| 7020 | FBgn0035189 | 2.2e+02 | -2.9e+00 | 2.2e-01 | -1.3e+01 | 0e+00 | 0e+00 | CG9119 | uncharacterized protein |
| 6495 | FBgn0034434 | 1.1e+02 | 3.7e+00 | 2.8e-01 | 1.3e+01 | 0e+00 | 0e+00 | NA | NA |
| 8839 | FBgn0037754 | 2.7e+02 | 2.3e+00 | 1.8e-01 | 1.3e+01 | 0e+00 | 0e+00 | CG8500 | uncharacterized protein |
| 2709 | FBgn0027279 | 3.0e+03 | 1.2e+00 | 9.2e-02 | 1.3e+01 | 0e+00 | 0e+00 | l(1)G0196 | lethal (1) G0196 |
| 14579 | FBgn0261552 | 5.1e+03 | 1.9e+00 | 1.5e-01 | 1.3e+01 | 0e+00 | 0e+00 | ps | pasilla |
| 304 | FBgn0001226 | 1.3e+03 | -1.7e+00 | 1.3e-01 | -1.3e+01 | 0e+00 | 0e+00 | Hsp27 | Heat shock protein 27 |
| 10421 | FBgn0040099 | 9.1e+02 | 1.7e+00 | 1.4e-01 | 1.3e+01 | 0e+00 | 0e+00 | lectin-28C | lectin-28C |